Enrichment analysis - GO BP
Initialization
We start the analysis by initializing the packages required for all the analysis performed in this section. We also define the root directory, within which all the input/output operations for this project will be performed. At the end of this document, detailed software version information is provided for easier reproducibility of the analysis.
library(hypeR)
library(reshape2)
library(pheatmap)
library(RColorBrewer)
library(dynamicTreeCut)
library(DT)
path = "/Users/ashwin/Documents/Projects/YeastScreen/Nonessential_MATa_screen/"
normDat = readRDS(paste0(path, "data/workspaces/YeastMutantRedox_NormalizedData_RobustZ.RDS"))
normDat = normDat$redox_table
genesets = readRDS(paste0(path, "data/workspaces/YeastGeneSets.RDS"))Hypergeometric test
Here we write a simple hypergeometric test function to perform enrichment analysis. This function is based on the explainations provided on these three sources 1, 2 and 3
computeEnrichment = function(gset, genes, fdr_cutoff = 1, pval_cutoff = 1, sort_output = TRUE) {
# removing possible duplicated entries
gset = lapply(gset, unique)
genes = unique(genes)
# constructing the background population of genes having at least one functional
# annotation
univ = unique(unlist(gset))
# from the selection of genes, considering only those having at least one
# functional annotation
genes = genes[genes %in% univ]
# size of background population - i.e total number of genes with at least one
# functional annotation
N = length(univ)
# size of selection - total number of genes in our selection subset
k = length(genes)
# total number of genes in the functional group/pathway
m = sapply(gset, length)
# total number of genes NOT in the functional group/pathway
n = N - m
# total number of the selected genes in the functional group/pathway
x = sapply(gset, function(y) sum(genes %in% y))
# compiling final result
res = data.frame(matrix(ncol = 6, nrow = length(gset)), stringsAsFactors = F)
colnames(res) = c("pathway", "pval", "fdr", "foldEnrich", "overlapPercent", "overlapGenes")
res$pathway = names(gset)
res$pval = signif(phyper(q = x - 1, m = m, n = n, k = k, lower.tail = FALSE),
2)
res$fdr = signif(p.adjust(res$pval, method = "fdr"), 2)
res$foldEnrich = round(c((x/k)/(m/N)), 2)
res$overlapPercent = round(c(x/m * 100), 2)
res$overlapGenes = sapply(gset, function(x) paste(x[x %in% genes], collapse = "|"))
# selection of significant hits
res = res[res$pval <= pval_cutoff, ]
res = res[res$fdr <= fdr_cutoff, ]
# sort results
if (sort_output) {
res = res[order(res$foldEnrich, decreasing = T), ]
}
# delete variables
rm(N, k, m, n, x, univ, genes, gset)
# return function output
return(res)
}Enrichment analysis of top hits
Next we check the enrichment of the -
- mutants having the top and lowest 25% of roGFP2 ratios in cytoplasm with carbon source as glucose, galactose and glycerol
- mutants having the top and lowest 25% of roGFP2 ratios in mitochondria with carbon source as glucose, galactose and glycerol
across GOBP pathways
# Cytoplasm
cyto = normDat[normDat$Organelle == "Cytoplasm", ]
cyto = split(cyto, cyto$Nutrient)
cytoOX = lapply(cyto, function(x) {
genes = x$Genes
vals = x$Median_roGFP2_ratio
qt.top = quantile(vals, 0.75)
selgenes = unique(genes[vals > qt.top])
x = computeEnrichment(gset = genesets$GOBP, genes = selgenes)
# Exactly same as in hypeR package For sanity check of my own implemented
# function !!
# x = hypeR(signature = selgenes[selgenes %in% unique(unlist(genesets$GOBP))],
# gsets = genesets$GOBP, test = 'hypergeometric', bg =
# unique(unlist(genesets$GOBP)), pval_cutoff = 0.05, fdr_cutoff = 0.25, verbose =
# T)$data
return(x)
})
cytoRX = lapply(cyto, function(x) {
genes = x$Genes
vals = x$Median_roGFP2_ratio
qt.low = quantile(vals, 0.25)
selgenes = unique(genes[vals < qt.low])
x = computeEnrichment(gset = genesets$GOBP, genes = selgenes)
return(x)
})
# Mitochondria
mito = normDat[normDat$Organelle == "Mitochondria", ]
mito = split(mito, mito$Nutrient)
mitoOX = lapply(mito, function(x) {
genes = x$Genes
vals = x$Median_roGFP2_ratio
qt.top = quantile(vals, 0.75)
selgenes = unique(genes[vals > qt.top])
x = computeEnrichment(gset = genesets$GOBP, genes = selgenes)
return(x)
})
mitoRX = lapply(mito, function(x) {
genes = x$Genes
vals = x$Median_roGFP2_ratio
qt.low = quantile(vals, 0.25)
selgenes = unique(genes[vals < qt.low])
x = computeEnrichment(gset = genesets$GOBP, genes = selgenes)
return(x)
})
rm(cyto, mito)We construct a table containing the enrichment analysis results across all organelle and nutrient conditions.
combo_enrichmat = rbind(cbind(cytoOX$Glucose, Organelle = "Cytoplasm", Nutrient = "Glucose", roGFP2 = "High"),
cbind(cytoOX$Galactose, Organelle = "Cytoplasm", Nutrient = "Galactose", roGFP2 = "High"),
cbind(cytoOX$Glycerol, Organelle = "Cytoplasm", Nutrient = "Glycerol", roGFP2 = "High"),
cbind(mitoOX$Glucose, Organelle = "Mitochondria", Nutrient = "Glucose", roGFP2 = "High"),
cbind(mitoOX$Galactose, Organelle = "Mitochondria", Nutrient = "Galactose", roGFP2 = "High"),
cbind(mitoOX$Glycerol, Organelle = "Mitochondria", Nutrient = "Glycerol", roGFP2 = "High"),
cbind(cytoRX$Glucose, Organelle = "Cytoplasm", Nutrient = "Glucose", roGFP2 = "Low"),
cbind(cytoRX$Galactose, Organelle = "Cytoplasm", Nutrient = "Galactose", roGFP2 = "Low"),
cbind(cytoRX$Glycerol, Organelle = "Cytoplasm", Nutrient = "Glycerol", roGFP2 = "Low"),
cbind(mitoRX$Glucose, Organelle = "Mitochondria", Nutrient = "Glucose", roGFP2 = "Low"),
cbind(mitoRX$Galactose, Organelle = "Mitochondria", Nutrient = "Galactose", roGFP2 = "Low"),
cbind(mitoRX$Glycerol, Organelle = "Mitochondria", Nutrient = "Glycerol", roGFP2 = "Low")
)
combo_enrichmat = combo_enrichmat[,c(1:5,7,8,9,6)]
datatable(combo_enrichmat, rownames = FALSE, filter = 'top', class='compact',
extensions = c('Buttons') ,
options = list( autoWidth = TRUE,
dom = 'Blfrtip',
buttons = c('csv', 'excel')
)) %>% formatStyle("pathway","white-space"="nowrap")Visualization of enrichment analysis
We plot the following heatmaps highlighting -
- A binary matrix of the enrichment p values (fdr pvalue < 0.25 = 1 and fdr pvalue > 0.25 = 0) across all carbon sources and organelles is created. (clustering of rows by binary distance)
- Fold enrichment over background (clustering of rows by euclidean distance)
- Fraction overlap (clustering of rows by euclidean distance)
All clustering is done by ward.D2 method. We used the Hybrid Adaptive Tree Cutting of dendrograms provided in dynamicTreeCut package to identify optimal number of clusters. The associated paper describing the method can be found here. Below is the heatmap showing the enrichment fdr p values (shown in purple hues, darker color corresponds to lower pvalue, fdr pvalue > 0.25 is shown in white).
# Creating a common matrix with all the enrichment pvalues
# Rows are pathways, columns are all combinations of nutrient, organelle and roGFP2 states
redoxEnrichMat = dcast(pathway ~ Nutrient + Organelle + roGFP2,
data = combo_enrichmat[, c(1, 3, 6:8)],
value.var = "fdr")
rownames(redoxEnrichMat) = redoxEnrichMat$pathway
redoxEnrichMat = redoxEnrichMat[, -1]
col_order = c(
"Glucose_Mitochondria_High",
"Galactose_Mitochondria_High",
"Glycerol_Mitochondria_High",
"Glucose_Cytoplasm_High",
"Galactose_Cytoplasm_High",
"Glycerol_Cytoplasm_High",
"Glucose_Mitochondria_Low",
"Galactose_Mitochondria_Low",
"Glycerol_Mitochondria_Low",
"Glucose_Cytoplasm_Low",
"Galactose_Cytoplasm_Low",
"Glycerol_Cytoplasm_Low"
)
redoxEnrichMat = redoxEnrichMat[, match(col_order, colnames(redoxEnrichMat))]
rm(col_order)
# Setting all FDR > 0.25 to NA and removing rows with all NA values
redoxEnrichMat[redoxEnrichMat > 0.25] = NA
na_cnt = apply(redoxEnrichMat, 1, function(x) {
sum(is.na(x))
})
redoxEnrichMat = redoxEnrichMat[na_cnt < 12,]
# Performing a binary clustering on the rows of the enrichment matrix
# Binary matrix : FDR < 0.25 = 1 and FDR > 0.25 = 0
# We will use this clustering for the heatmap
redoxEnrichMat.binary = redoxEnrichMat
redoxEnrichMat.binary[is.na(redoxEnrichMat.binary)] = 0
redoxEnrichMat.binary[redoxEnrichMat.binary > 0] = 1
d = dist(redoxEnrichMat.binary, method = "binary")
h = hclust(d, method = "ward.D2")
# Identifying robust clusters using dynamic tree cutting
# We will label the rows according to these clusters
dct = cutreeHybrid(
dendro = h,
distM = as.matrix(d),
minClusterSize = 5,
verbose = 0
)
anno_rows = data.frame(Clusters = dct$labels)
rownames(anno_rows) = h$labels
# Changing the levels order such that the annotations colors and cluster bars are in same order
anno_rows_sort = anno_rows$Clusters[h$order]
level_order = unique(anno_rows_sort)
anno_rows$Clusters = factor(anno_rows$Clusters, levels = level_order)
# Find row gaps corresponding to clusters
#row_gaps = sapply(split(1:length(anno_rows_sort), anno_rows_sort), function(x)x[length(x)])
#row_gaps = row_gaps[match(levels(anno_rows$Clusters), names(row_gaps))]
#row_gaps = row_gaps[-length(row_gaps)]
# Setting the annotation order and naming for the columns
anno_columns = data.frame(
Nutrient = rep(c("Glucose", "Galactose", "Glycerol"), 4),
Organelle = c(
rep("Mitochondria", 3),
rep("Cytoplasm", 3),
rep("Mitochondria", 3),
rep("Cytoplasm", 3)
),
roGFP2 = c(rep("High", 6), rep("Low", 6))
)
rownames(anno_columns) = colnames(redoxEnrichMat)
# Setting all the column and row annotation colors
col.carbon = brewer.pal(3, "Dark2")
names(col.carbon) = c("Glucose", "Galactose", "Glycerol")
col.organelle = brewer.pal(5, "Dark2")[4:5]
names(col.organelle) = c("Mitochondria", "Cytoplasm")
col.cluster = brewer.pal(length(levels(anno_rows$Clusters)), "Paired")
names(col.cluster) = as.character(levels(anno_rows$Clusters))
anno_color = list(
Nutrient = col.carbon,
Organelle = col.organelle,
roGFP2 = c(High = "black", Low = "grey"),
Clusters = col.cluster
)
# Setting the color for the heatmap color key
color.strip = rev(colorRampPalette(brewer.pal(9, "Purples"))(6))
color.strip[length(color.strip)] = "#FFFFFF"
# Heatmap
pheatmap(
redoxEnrichMat,
cluster_cols = F,
clustering_distance_rows = d,
clustering_method = "ward.D2",
#cutree_rows = 8,
breaks = seq(0, 0.30, 0.05),
fontsize = 8,
border_color = "grey90",
na_col = "white",
show_colnames = F,
gaps_col = c(3, 6, 9),
#gaps_row = row_gaps,
color = color.strip,
annotation_col = anno_columns,
annotation_row = anno_rows,
annotation_colors = anno_color
)Below is the heatmap showing the enrichment fold change over background. This is defined as -
Nbe the total number of genes with at least one functional annotation (unique set of genes from all KEGG pathways),kbe the total number of genes in our selection subset (say mutants with top 25% roGFP2 ratio)mbe the total number of genes in the functional group/pathway (say TCA cycle)xbe the the number ofkinm, i.e. total number of genes common in our selection and the selected functional pathway
then, fold enrichment over background is given as \[ \frac{\frac{x} {k}} {\frac{m}{N}} \]
# Creating a common matrix with all the enrichment fold change Rows are pathways,
# columns are all combinations of nutrient, organelle and roGFP2 states
redoxEnrichMat = dcast(pathway ~ Nutrient + Organelle + roGFP2, data = combo_enrichmat[,
c(1, 4, 6:8)], value.var = "foldEnrich")
rownames(redoxEnrichMat) = redoxEnrichMat$pathway
redoxEnrichMat = redoxEnrichMat[, -1]
col_order = c("Glucose_Mitochondria_High", "Galactose_Mitochondria_High", "Glycerol_Mitochondria_High",
"Glucose_Cytoplasm_High", "Galactose_Cytoplasm_High", "Glycerol_Cytoplasm_High",
"Glucose_Mitochondria_Low", "Galactose_Mitochondria_Low", "Glycerol_Mitochondria_Low",
"Glucose_Cytoplasm_Low", "Galactose_Cytoplasm_Low", "Glycerol_Cytoplasm_Low")
redoxEnrichMat = redoxEnrichMat[, match(col_order, colnames(redoxEnrichMat))]
rm(col_order)
# Row clustering
d = dist(redoxEnrichMat, method = "euclidean")
h = hclust(d, method = "ward.D2")
# Identifying robust clusters using dynamic tree cutting We will label the rows
# according to these clusters
dct = cutreeHybrid(dendro = h, distM = as.matrix(d), minClusterSize = 5, verbose = 0)
anno_rows = data.frame(Clusters = dct$labels)
rownames(anno_rows) = h$labels
# Changing the levels order such that the annotations colors and cluster bars are
# in same order
anno_rows_sort = anno_rows$Clusters[h$order]
level_order = unique(anno_rows_sort)
anno_rows$Clusters = factor(anno_rows$Clusters, levels = level_order)
# Setting the annotation order and naming for the columns
anno_columns = data.frame(Nutrient = rep(c("Glucose", "Galactose", "Glycerol"), 4),
Organelle = c(rep("Mitochondria", 3), rep("Cytoplasm", 3), rep("Mitochondria",
3), rep("Cytoplasm", 3)), roGFP2 = c(rep("High", 6), rep("Low", 6)))
rownames(anno_columns) = colnames(redoxEnrichMat)
# Setting all the column and row annotation colors
col.carbon = brewer.pal(3, "Dark2")
names(col.carbon) = c("Glucose", "Galactose", "Glycerol")
col.organelle = brewer.pal(5, "Dark2")[4:5]
names(col.organelle) = c("Mitochondria", "Cytoplasm")
col.cluster = brewer.pal(length(levels(anno_rows$Clusters)), "Paired")
names(col.cluster) = as.character(levels(anno_rows$Clusters))
anno_color = list(Nutrient = col.carbon, Organelle = col.organelle, roGFP2 = c(High = "black",
Low = "grey"), Clusters = col.cluster)
# Setting the color for the heatmap color key
brk = pretty(0:round(max(redoxEnrichMat)))
color.strip = colorRampPalette(brewer.pal(9, "YlOrRd"))(c(length(brk) - 1))
# Heatmap
pheatmap(redoxEnrichMat, cluster_cols = F, clustering_distance_rows = d, clustering_method = "ward.D2",
fontsize = 8, border_color = "white", na_col = "white", show_colnames = F, gaps_col = c(3,
6, 9), color = color.strip, annotation_col = anno_columns, annotation_row = anno_rows,
annotation_colors = anno_color)Below is the heatmap showing the simple fraction overlap between the gene sets and the selected genes. This is defined as -
kbe the total number of genes in our selection subset (say mutants with top 25% roGFP2 ratio)mbe the total number of genes in the functional group/pathway (say TCA cycle)xbe the the number ofkinm, i.e. total number of genes common in our selection and the selected functional pathway
then, fraction overlap is simply \[ \frac{x} {m} \times 100 \]
# Creating a common matrix with all the enrichment fold change Rows are pathways,
# columns are all combinations of nutrient, organelle and roGFP2 states
redoxEnrichMat = dcast(pathway ~ Nutrient + Organelle + roGFP2, data = combo_enrichmat[,
c(1, 5, 6:8)], value.var = "overlapPercent")
rownames(redoxEnrichMat) = redoxEnrichMat$pathway
redoxEnrichMat = redoxEnrichMat[, -1]
col_order = c("Glucose_Mitochondria_High", "Galactose_Mitochondria_High", "Glycerol_Mitochondria_High",
"Glucose_Cytoplasm_High", "Galactose_Cytoplasm_High", "Glycerol_Cytoplasm_High",
"Glucose_Mitochondria_Low", "Galactose_Mitochondria_Low", "Glycerol_Mitochondria_Low",
"Glucose_Cytoplasm_Low", "Galactose_Cytoplasm_Low", "Glycerol_Cytoplasm_Low")
redoxEnrichMat = redoxEnrichMat[, match(col_order, colnames(redoxEnrichMat))]
rm(col_order)
# Row clustering
d = dist(redoxEnrichMat, method = "euclidean")
h = hclust(d, method = "ward.D2")
# Identifying robust clusters using dynamic tree cutting We will label the rows
# according to these clusters
dct = cutreeHybrid(dendro = h, distM = as.matrix(d), minClusterSize = 5, verbose = 0)
anno_rows = data.frame(Clusters = dct$labels)
rownames(anno_rows) = h$labels
# Changing the levels order such that the annotations colors and cluster bars are
# in same order
anno_rows_sort = anno_rows$Clusters[h$order]
level_order = unique(anno_rows_sort)
anno_rows$Clusters = factor(anno_rows$Clusters, levels = level_order)
# Setting the annotation order and naming for the columns
anno_columns = data.frame(Nutrient = rep(c("Glucose", "Galactose", "Glycerol"), 4),
Organelle = c(rep("Mitochondria", 3), rep("Cytoplasm", 3), rep("Mitochondria",
3), rep("Cytoplasm", 3)), roGFP2 = c(rep("High", 6), rep("Low", 6)))
rownames(anno_columns) = colnames(redoxEnrichMat)
# Setting all the column and row annotation colors
col.carbon = brewer.pal(3, "Dark2")
names(col.carbon) = c("Glucose", "Galactose", "Glycerol")
col.organelle = brewer.pal(5, "Dark2")[4:5]
names(col.organelle) = c("Mitochondria", "Cytoplasm")
col.cluster = brewer.pal(length(levels(anno_rows$Clusters)), "Paired")
names(col.cluster) = as.character(levels(anno_rows$Clusters))
anno_color = list(Nutrient = col.carbon, Organelle = col.organelle, roGFP2 = c(High = "black",
Low = "grey"), Clusters = col.cluster)
# Setting the color for the heatmap color key
brk = pretty(0:round(max(redoxEnrichMat)))
color.strip = colorRampPalette(brewer.pal(9, "BuGn"))(c(length(brk) - 1))
# Heatmap
pheatmap(redoxEnrichMat, cluster_cols = F, clustering_distance_rows = d, clustering_method = "ward.D2",
breaks = brk, fontsize = 8, border_color = "white", na_col = "white", show_colnames = F,
gaps_col = c(3, 6, 9), color = color.strip, annotation_col = anno_columns, annotation_row = anno_rows,
annotation_colors = anno_color)Session information
R version 3.6.2 (2019-12-12)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS 10.16
Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] DT_0.12 dynamicTreeCut_1.63-1 RColorBrewer_1.1-2
[4] pheatmap_1.0.12 reshape2_1.4.3 hypeR_1.2.0
[7] rmdformats_0.3.6 knitr_1.28
loaded via a namespace (and not attached):
[1] Rcpp_1.0.3 visNetwork_2.0.9 assertthat_0.2.1 digest_0.6.23
[5] ggforce_0.3.1 mime_0.9 R6_2.4.1 plyr_1.8.5
[9] evaluate_0.14 httr_1.4.1 ggplot2_3.2.1 pillar_1.4.3
[13] rlang_0.4.4 lazyeval_0.2.2 rstudioapi_0.11 rmarkdown_2.1
[17] webshot_0.5.2 readr_1.3.1 stringr_1.4.0 htmlwidgets_1.5.1
[21] igraph_1.2.4.2 polyclip_1.10-0 munsell_0.5.0 shiny_1.4.0
[25] compiler_3.6.2 httpuv_1.5.5 xfun_0.12 pkgconfig_2.0.3
[29] htmltools_0.4.0 tidyselect_1.0.0 tibble_2.1.3 bookdown_0.17
[33] fansi_0.4.1 viridisLite_0.3.0 crayon_1.3.4 dplyr_0.8.4
[37] later_1.0.0 MASS_7.3-51.5 grid_3.6.2 jsonlite_1.6.1
[41] xtable_1.8-4 gtable_0.3.0 lifecycle_0.1.0 magrittr_1.5
[45] formatR_1.7 scales_1.1.0 zip_2.0.4 cli_2.0.1
[49] stringi_1.4.5 farver_2.0.3 msigdbr_7.0.1 promises_1.1.0
[53] xml2_1.2.2 vctrs_0.2.2 gh_1.1.0 openxlsx_4.1.4
[57] kableExtra_1.1.0 tools_3.6.2 glue_1.3.1 tweenr_1.0.1
[61] purrr_0.3.3 hms_0.5.3 crosstalk_1.0.0 fastmap_1.0.1
[65] yaml_2.2.1 colorspace_1.4-1 rvest_0.3.5